{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Preprocessing for ECG Classification\n",
    "\n",
    "> Copyright 2019 Dave Fernandes. All Rights Reserved.\n",
    "> \n",
    "> Licensed under the Apache License, Version 2.0 (the \"License\");\n",
    "> you may not use this file except in compliance with the License.\n",
    "> You may obtain a copy of the License at\n",
    ">\n",
    "> http://www.apache.org/licenses/LICENSE-2.0\n",
    ">  \n",
    "> Unless required by applicable law or agreed to in writing, software\n",
    "> distributed under the License is distributed on an \"AS IS\" BASIS,\n",
    "> WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
    "> See the License for the specific language governing permissions and\n",
    "> limitations under the License."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Data can be downloaded from: https://www.kaggle.com/shayanfazeli/heartbeat\n",
    "\n",
    "- The two MIT-BIH CSV files are pooled and shuffled, then 100 time series of each class are set aside for the test set. This is just over 10% of the samples in the smallest class.\n",
    "- The remaining data is approximately balanced for the training set by repeating the samples of under-represented classes a whole number of times."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import pickle\n",
    "\n",
    "CSV_1 = './Data/mitbih_train.csv'\n",
    "CSV_2 = './Data/mitbih_test.csv'\n",
    "\n",
    "TRAIN_SET = './Data/train_set.pickle'\n",
    "TEST_SET = './Data/test_set.pickle'\n",
    "\n",
    "# Fixed seed so that Restart & Run All always produces the same shuffle,\n",
    "# and therefore the same train/test split\n",
    "SEED = 42\n",
    "\n",
    "# Both CSV files are pooled into a single table; the split is redone below\n",
    "raw_1 = pd.read_csv(CSV_1, header=None)\n",
    "raw_2 = pd.read_csv(CSV_2, header=None)\n",
    "raw = pd.concat([raw_1, raw_2], axis=0)\n",
    "\n",
    "# Shuffle all rows so that the first N rows of any class are a random sample\n",
    "shuffled = raw.sample(frac=1, axis=0, random_state=SEED)\n",
    "del raw, raw_1, raw_2\n",
    "\n",
    "# The last column holds the integer class label; all other columns are features\n",
    "values = shuffled.values\n",
    "x = values[:, :-1]\n",
    "y = values[:, -1].astype(int)\n",
    "del values, shuffled"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "TEST_CLASS_SIZE = 100\n",
    "# Class labels are the integers 0 .. NUM_CLASSES - 1\n",
    "NUM_CLASSES = 5\n",
    "\n",
    "class_x = []\n",
    "class_count = []\n",
    "test_x_parts = []\n",
    "test_y_parts = []\n",
    "\n",
    "for label in range(NUM_CLASSES):\n",
    "    x_i = x[y == label]\n",
    "\n",
    "    # Every class must fill its test quota and still leave training rows.\n",
    "    # Otherwise the test labels would not line up with the test rows and the\n",
    "    # balancing ratio below would divide by zero.\n",
    "    if len(x_i) <= TEST_CLASS_SIZE:\n",
    "        raise ValueError(\n",
    "            'Class %d has only %d samples; more than %d are required'\n",
    "            % (label, len(x_i), TEST_CLASS_SIZE))\n",
    "\n",
    "    # Take the first TEST_CLASS_SIZE elements for the test set\n",
    "    test_x_parts.append(x_i[:TEST_CLASS_SIZE, :])\n",
    "    test_y_parts.append(np.full(TEST_CLASS_SIZE, label, dtype=int))\n",
    "\n",
    "    # Use the remainder of the elements for the training set\n",
    "    x_i = x_i[TEST_CLASS_SIZE:, :]\n",
    "    class_x.append(x_i)\n",
    "    class_count.append(len(x_i))\n",
    "\n",
    "# Join the per-class pieces once instead of growing the arrays inside the loop\n",
    "x_test = np.concatenate(test_x_parts, axis=0)\n",
    "y_test = np.concatenate(test_y_parts)\n",
    "\n",
    "# Compute the whole-number multiple of each class needed to balance the classes\n",
    "counts = max(class_count) // np.array(class_count)\n",
    "print('Multiples:', counts)\n",
    "\n",
    "# Repeat each class's training rows by its multiple, in class order\n",
    "x_bal = np.concatenate(\n",
    "    [np.tile(class_x[label], (counts[label], 1)) for label in range(NUM_CLASSES)],\n",
    "    axis=0)\n",
    "y_bal = np.concatenate(\n",
    "    [np.full(counts[label] * class_count[label], label, dtype=int)\n",
    "     for label in range(NUM_CLASSES)])\n",
    "\n",
    "print('Training set shapes:', np.shape(x_bal), np.shape(y_bal))\n",
    "print('Test set shapes:', np.shape(x_test), np.shape(y_test))\n",
    "\n",
    "with open(TEST_SET, 'wb') as file:\n",
    "    pickle.dump({'x': x_test, 'y': y_test}, file)\n",
    "\n",
    "with open(TRAIN_SET, 'wb') as file:\n",
    "    pickle.dump({'x': x_bal, 'y': y_bal}, file)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Next\n",
    "Run the `ClassifyECG.ipynb` notebook next..."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
